#
# Copyright 2002 Free Software Foundation, Inc.
# 
# This file is part of GNU Radio
# 
# GNU Radio is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
# 
# GNU Radio is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with GNU Radio; see the file COPYING.  If not, write to
# the Free Software Foundation, Inc., 51 Franklin Street,
# Boston, MA 02110-1301, USA.
# 


# input and taps are guaranteed to be 16-byte aligned.
# n_4_float_blocks is != 0
#	
#
#  float 
#  sse_float_dotprod (const float *input,
#                  const float *taps, unsigned n_4_float_blocks)
#  {
#    float sum0 = 0;
#    float sum1 = 0;
#    float sum2 = 0;
#    float sum3 = 0;
#  
#    do {
#  
#      sum0 += input[0] * taps[0];
#      sum1 += input[1] * taps[1];
#      sum2 += input[2] * taps[2];
#      sum3 += input[3] * taps[3];
#  
#      input += 4;
#      taps += 4;
#  
#    } while (--n_4_float_blocks != 0);
#  
#  
#    return sum0 + sum1 + sum2 + sum3;
#  }
#  		


	.file	"3dnow_float_dotprod_simple.s"
	.version	"01.01"

#-----------------------------------------------------------------------
# float sse_float_dotprod (const float *input, const float *taps,
#                          unsigned n_4_float_blocks)
#
# Dot product of two float vectors, processed in blocks of four floats
# (16 bytes) using 3DNow! packed-float arithmetic.
#
# Syntax: GNU as, AT&T operand order (op src, dst), 32-bit x86, ELF.
#
# In:    input             read from 8(%ebp)
#        taps              read from 12(%ebp)
#        n_4_float_blocks  read from 16(%ebp)
# Out:   st(0) = sum of input[i] * taps[i] over every block; a block
#        count of zero yields 0.0 instead of running the loop.
# Clobb: %eax, %ecx, %edx, flags, %mm0, %mm1, %mm4, %mm5 (femms is
#        executed before returning), and the stack slot at 16(%ebp),
#        which is reused as scratch once the count is in %ecx.
#-----------------------------------------------------------------------
.text
	.p2align 4
.globl sse_float_dotprod
	.type	 sse_float_dotprod,@function
sse_float_dotprod:
	pushl	%ebp
	movl	%esp, %ebp
	movl	8(%ebp), %edx		# edx = input
	movl	12(%ebp), %eax		# eax = taps
	movl	16(%ebp), %ecx		# ecx = blocks still to process

	# Two independent accumulators, each holding two packed floats:
	# mm4 sums elements 0 and 1 of every block, mm5 sums elements 2 and 3.

	pxor	%mm4, %mm4		# mm4 = 0 0
	pxor	%mm5, %mm5		# mm5 = 0 0

	# The loop below tests its counter only after the first pass, so a
	# zero count would wrap %ecx and keep reading memory.  Skip straight
	# to the reduction in that case; the accumulators are already zero.

	testl	%ecx, %ecx
	je	.Lreduce

	.p2align 4
.Lloop1:
	movq	0(%eax), %mm0		# mm0 = taps[1] taps[0]
	movq	8(%eax), %mm1		# mm1 = taps[3] taps[2]

	pfmul	0(%edx), %mm0		# mm0 *= input[1] input[0]
	pfadd	%mm0, %mm4

	pfmul	8(%edx), %mm1		# mm1 *= input[3] input[2]
	pfadd	%mm1, %mm5

	addl	$16, %edx		# input += 4 floats
	addl	$16, %eax		# taps  += 4 floats
	decl	%ecx
	jne	.Lloop1

.Lreduce:
	# at this point mm4 and mm5 contain partial sums

	pfadd	%mm5, %mm4		# fold the two accumulators together
	pfacc	%mm4, %mm4		# low float = low + high = total
	movd	%mm4, 16(%ebp)		# spill total; the count slot is free now
	femms				# leave MMX state before using the x87 stack
	flds	16(%ebp)		# return value in st(0)

	popl	%ebp
	ret
.Lfe1:
	.size	 sse_float_dotprod,.Lfe1-sse_float_dotprod
	.ident	"Hand coded x86 3DNow! assembly"

	# Declare that this object does not need an executable stack; GNU ld
	# otherwise assumes it does.  pushsection/popsection leaves the
	# current section unchanged for anything that follows.
	.pushsection .note.GNU-stack,"",@progbits
	.popsection
